Geographic Analysis

Introduction

North American Industry Classification System (NAICS)

import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

# Fix the NumPy RNG seed so any random sampling below is reproducible.
np.random.seed(42)

# Render Plotly figures inline in the notebook output.
pio.renderers.default = "notebook"
## Listing Columns So We Can Reference them in Visuals

import pandas as pd

# low_memory=False reads the whole file in one pass so pandas infers a single
# dtype per column; this silences the DtypeWarning about mixed types in
# columns 19 and 30 that the default chunked reader emits.
df = pd.read_csv("./data/lightcast_job_postings.csv", low_memory=False)
df.head()
/tmp/ipykernel_4047/736325668.py:4: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.

/tmp/ipykernel_4047/736325668.py:4: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
ID LAST_UPDATED_DATE LAST_UPDATED_TIMESTAMP DUPLICATES POSTED EXPIRED DURATION SOURCE_TYPES SOURCES URL ... NAICS_2022_2 NAICS_2022_2_NAME NAICS_2022_3 NAICS_2022_3_NAME NAICS_2022_4 NAICS_2022_4_NAME NAICS_2022_5 NAICS_2022_5_NAME NAICS_2022_6 NAICS_2022_6_NAME
0 1f57d95acf4dc67ed2819eb12f049f6a5c11782c 9/6/2024 2024-09-06 20:32:57.352 Z 0.0 6/2/2024 6/8/2024 6.0 [\n "Company"\n] [\n "brassring.com"\n] [\n "https://sjobs.brassring.com/TGnewUI/Sear... ... 44.0 Retail Trade 441.0 Motor Vehicle and Parts Dealers 4413.0 Automotive Parts, Accessories, and Tire Retailers 44133.0 Automotive Parts and Accessories Retailers 441330.0 Automotive Parts and Accessories Retailers
1 0cb072af26757b6c4ea9464472a50a443af681ac 8/2/2024 2024-08-02 17:08:58.838 Z 0.0 6/2/2024 8/1/2024 NaN [\n "Job Board"\n] [\n "maine.gov"\n] [\n "https://joblink.maine.gov/jobs/1085740"\n] ... 56.0 Administrative and Support and Waste Managemen... 561.0 Administrative and Support Services 5613.0 Employment Services 56132.0 Temporary Help Services 561320.0 Temporary Help Services
2 85318b12b3331fa490d32ad014379df01855c557 9/6/2024 2024-09-06 20:32:57.352 Z 1.0 6/2/2024 7/7/2024 35.0 [\n "Job Board"\n] [\n "dejobs.org"\n] [\n "https://dejobs.org/dallas-tx/data-analys... ... 52.0 Finance and Insurance 524.0 Insurance Carriers and Related Activities 5242.0 Agencies, Brokerages, and Other Insurance Rela... 52429.0 Other Insurance Related Activities 524291.0 Claims Adjusting
3 1b5c3941e54a1889ef4f8ae55b401a550708a310 9/6/2024 2024-09-06 20:32:57.352 Z 1.0 6/2/2024 7/20/2024 48.0 [\n "Job Board"\n] [\n "disabledperson.com",\n "dejobs.org"\n] [\n "https://www.disabledperson.com/jobs/5948... ... 52.0 Finance and Insurance 522.0 Credit Intermediation and Related Activities 5221.0 Depository Credit Intermediation 52211.0 Commercial Banking 522110.0 Commercial Banking
4 cb5ca25f02bdf25c13edfede7931508bfd9e858f 6/19/2024 2024-06-19 07:00:00.000 Z 0.0 6/2/2024 6/17/2024 15.0 [\n "FreeJobBoard"\n] [\n "craigslist.org"\n] [\n "https://modesto.craigslist.org/sls/77475... ... 99.0 Unclassified Industry 999.0 Unclassified Industry 9999.0 Unclassified Industry 99999.0 Unclassified Industry 999999.0 Unclassified Industry

5 rows × 131 columns

# Restrict postings to five states of interest.
selected_state = ['California', 'Florida', 'Massachusetts', 'Texas', 'New York']
state_mask = df['STATE_NAME'].isin(selected_state)
filtered_df = df.loc[state_mask]

# Keep only postings whose specialized occupation name mentions "analyst"
# (case-insensitive; na=False treats missing names as non-matches).
analyst_mask = filtered_df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)
final_df = filtered_df.loc[analyst_mask]

# Preview state, 2-digit industry, 6-digit NAICS code, and occupation.
final_df[['STATE_NAME', 'NAICS2_NAME', 'NAICS_2022_6', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)
STATE_NAME NAICS2_NAME NAICS_2022_6 LOT_SPECIALIZED_OCCUPATION_NAME
2 Texas Finance and Insurance 524291.0 Data Analyst
4 California Unclassified Industry 999999.0 Oracle Consultant / Analyst
9 New York Professional, Scientific, and Technical Services 541511.0 Data Analyst
10 California Wholesale Trade 423830.0 Data Analyst
15 Massachusetts Educational Services 611310.0 Data Analyst
... ... ... ... ...
294 Florida Educational Services 611310.0 SAP Analyst / Admin
295 California Finance and Insurance 524114.0 Data Analyst
296 New York Unclassified Industry 999999.0 General ERP Analyst / Consultant
297 Texas Professional, Scientific, and Technical Services 541611.0 SAP Analyst / Admin
299 Texas Professional, Scientific, and Technical Services 541511.0 General ERP Analyst / Consultant

100 rows × 4 columns

import matplotlib.pyplot as plt
import numpy as np

# Analyst postings per state (within the five filtered states), largest first.
jobs_per_state = (
    final_df.groupby('STATE_NAME')
    .size()
    .reset_index(name='job_count')
    .sort_values(by='job_count', ascending=False)
)

# Column chart with one coolwarm colour per state.
plt.figure(figsize=(8, 5))
bar_colors = plt.cm.coolwarm(np.linspace(0, 1, len(jobs_per_state)))
plt.bar(jobs_per_state['STATE_NAME'], jobs_per_state['job_count'], color=bar_colors)
plt.xlabel('State')
plt.ylabel('Number of Jobs')
plt.title('Tech Jobs by State (Job Title Contains "Analyst")')
plt.xticks(rotation=45)
plt.tight_layout()

# Save a copy for the report, then display inline.
plt.savefig('figures/states_analyst_jobs.png', dpi=300, bbox_inches='tight')
plt.show()

import matplotlib.pyplot as plt
import numpy as np

# Analyst postings per city; keep only the ten largest.
jobs_per_city = (
    final_df.groupby('CITY_NAME')
    .size()
    .reset_index(name='job_count')
    .sort_values(by='job_count', ascending=False)
    .head(10)
)

# Column chart with one coolwarm colour per city.
plt.figure(figsize=(10, 6))
bar_colors = plt.cm.coolwarm(np.linspace(0, 1, len(jobs_per_city)))
plt.bar(jobs_per_city['CITY_NAME'], jobs_per_city['job_count'], color=bar_colors)
plt.xlabel('City')
plt.ylabel('Number of Jobs')
plt.title('Top 10 Cities - Tech Jobs (Job Title Contains "Analyst")')
plt.xticks(rotation=45)
plt.tight_layout()

# Save a copy for the report, then display inline.
plt.savefig('figures/top_10_cities_analyst_jobs.png', dpi=300, bbox_inches='tight')

plt.show()

import plotly.express as px
import plotly.io as pio

# Build a NATIONWIDE analyst subset from the unfiltered dataframe, so the
# choropleth below covers every state rather than the five filtered earlier.
try:
    # Use the original df (before state filtering) to get all states
    all_states_df = df[df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
except NameError:
    # If df is not defined (cells run out of order), load the CSV fresh
    import pandas as pd
    df = pd.read_csv("./data/lightcast_job_postings.csv")
    all_states_df = df[df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]

# Posting counts per state, sorted largest-first for reporting.
all_state_counts = all_states_df.groupby('STATE_NAME').size().reset_index(name='job_count')
all_state_counts_sorted = all_state_counts.sort_values(by='job_count', ascending=False)

print("Top 10 states with most analyst jobs:")
print(all_state_counts_sorted.head(10))
print(f"\nTotal states with analyst jobs: {len(all_state_counts_sorted)}")

# Comprehensive state abbreviation mapping (50 states + DC).
state_abbrev_map = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
    'District of Columbia': 'DC',
    # The dataset spells DC this way (it previously showed up under
    # "Unmapped states" and was silently dropped from the map), so include
    # the variant spelling as well.
    'Washington, D.C. (District of Columbia)': 'DC'
}

# Add state abbreviations to the data; names missing from the map become NaN.
all_state_counts_sorted['state_abbrev'] = all_state_counts_sorted['STATE_NAME'].map(state_abbrev_map)

# Drop any rows whose state name could not be mapped (none expected now).
mapped_states = all_state_counts_sorted.dropna(subset=['state_abbrev'])

# Report mapping coverage so silent data loss is visible in the output.
print(f"\nStates successfully mapped: {len(mapped_states)}")
if len(mapped_states) < len(all_state_counts_sorted):
    unmapped = all_state_counts_sorted[all_state_counts_sorted['state_abbrev'].isna()]
    print("Unmapped states:")
    print(unmapped['STATE_NAME'].tolist())

# Choropleth of analyst job counts for every successfully mapped state.
fig = px.choropleth(
    mapped_states,
    locations='state_abbrev',
    locationmode='USA-states',
    color='job_count',
    color_continuous_scale='Greens',
    hover_name='STATE_NAME',
    hover_data={'state_abbrev': False, 'job_count': True},
    labels={'job_count': 'Number of Jobs', 'STATE_NAME': 'State'},
)

# Omit the title and shrink margins so the map fills the figure.
fig.update_layout(
    geo_scope='usa',
    width=1000,
    height=700,
    margin={'t': 5, 'b': 5, 'l': 5, 'r': 5},
    geo={
        'projection_type': 'albers usa',
        'showlakes': True,
        'lakecolor': 'rgb(255, 255, 255)',
        'bgcolor': 'rgba(0,0,0,0)',
    },
)

# Display the interactive map inline.
fig.show()
Top 10 states with most analyst jobs:
        STATE_NAME  job_count
42           Texas       7084
4       California       6501
8          Florida       3206
31        New York       3056
12        Illinois       3045
45        Virginia       2989
32  North Carolina       2423
34            Ohio       2403
9          Georgia       2363
29      New Jersey       2289

Total states with analyst jobs: 51

States successfully mapped: 50
Unmapped states:
['Washington, D.C. (District of Columbia)']
Top 10 states with most analyst jobs:
        STATE_NAME  job_count
42           Texas       7084
4       California       6501
8          Florida       3206
31        New York       3056
12        Illinois       3045
45        Virginia       2989
32  North Carolina       2423
34            Ohio       2403
9          Georgia       2363
29      New Jersey       2289

Total states with analyst jobs: 51

States successfully mapped: 50
Unmapped states:
['Washington, D.C. (District of Columbia)']

Tech Jobs Nationwide (Job Title Contains “Analyst”)

Interactive map showing the distribution of analyst jobs across all US states.

# Restrict postings to three NAICS 2-digit sectors:
#   11 = Agriculture, Forestry, Fishing and Hunting
#   21 = Mining, Quarrying, and Oil and Gas Extraction
#   22 = Utilities
selected_naics = [11, 21, 22]
filtered_df = df[df['NAICS_2022_2'].isin(selected_naics)]

# Preview state, sector code/name, and occupation for the first 100 matches.
filtered_df[['STATE_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)
STATE_NAME NAICS_2022_2 NAICS_2022_2_NAME LOT_SPECIALIZED_OCCUPATION_NAME
376 North Carolina 21.0 Mining, Quarrying, and Oil and Gas Extraction Data Analyst
394 North Carolina 22.0 Utilities General ERP Analyst / Consultant
502 California 22.0 Utilities Business Analyst (General)
525 Nebraska 22.0 Utilities Business Analyst (General)
632 Massachusetts 22.0 Utilities Business Analyst (General)
... ... ... ... ...
8531 Michigan 21.0 Mining, Quarrying, and Oil and Gas Extraction Data Analyst
8553 Texas 22.0 Utilities Enterprise Architect
8682 Florida 11.0 Agriculture, Forestry, Fishing and Hunting General ERP Analyst / Consultant
8698 Colorado 22.0 Utilities Enterprise Architect
8702 Oregon 22.0 Utilities Enterprise Architect

100 rows × 4 columns

import plotly.express as px

# Nationwide analyst postings, then the subset that carries a salary figure.
analyst_df = df[df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
analyst_salary_df = analyst_df[analyst_df['SALARY'].notna()]

print(f"Total analyst jobs: {len(analyst_df)}")
print(f"Analyst jobs with salary data: {len(analyst_salary_df)}")

# Per-state mean salary and posting count. Named aggregation is used instead
# of agg({'STATE_NAME': 'count'}) — aggregating the groupby key column via a
# dict is a fragile pattern that newer pandas versions reject. 'size' counts
# rows; it equals the old count because SALARY NaNs were already dropped.
bubble_data = (
    analyst_salary_df
    .groupby('STATE_NAME')
    .agg(SALARY=('SALARY', 'mean'), job_count=('SALARY', 'size'))
    .reset_index()
)

# Sort once, largest job count first, and reuse for both the printout
# and the top-10 slice (the original sorted twice).
bubble_data_sorted = bubble_data.sort_values('job_count', ascending=False)

print(f"\nStates with analyst salary data: {len(bubble_data)}")
print("\nTop 10 states by job count:")
print(bubble_data_sorted.head(10))

# Keep the top 10 states by job count for the chart.
top_10_states = bubble_data_sorted.head(10)

# Bubble chart: x = state, y = posting count, bubble size/colour = mean salary.
fig = px.scatter(
    top_10_states,
    x='STATE_NAME',
    y='job_count',
    size='SALARY',
    color='SALARY',
    hover_name='STATE_NAME',
    hover_data={
        'SALARY': ':,.0f',
        'job_count': True,
        'STATE_NAME': False
    },
    labels={
        'SALARY': 'Average Salary ($)',
        'STATE_NAME': 'State',
        'job_count': 'Number of Jobs (Excludes Null Salaries)'
    },
    title='Top 10 States for Analyst Jobs (Bubble Size = Salary)',
    color_continuous_scale='Greens'
)

# Widen the canvas and tilt the state labels so they don't overlap.
fig.update_layout(
    width=1200,
    height=700,
    xaxis_tickangle=-45,
    showlegend=True
)

# Scale bubble diameters relative to the largest salary (Plotly's
# recommended sizeref formula) and outline them for contrast.
fig.update_traces(
    marker=dict(
        sizemode='diameter',
        sizeref=2.0*max(top_10_states['SALARY'])/(15.**2),
        sizemin=4,
        line=dict(width=1, color='white')
    )
)

fig.show()